<?php
/**
 * This module implements a VERY limited parser that finds <link> tags
 * in the head of HTML or XHTML documents and parses out their
 * attributes according to the OpenID spec. It is a liberal parser,
 * but it requires these things from the data in order to work:
 *
 * - There must be an open <html> tag
 *
 * - There must be an open <head> tag inside of the <html> tag
 *
 * - Only <link>s that are found inside of the <head> tag are parsed
 *   (this is by design)
 *
 * - The parser follows the OpenID specification in resolving the
 *   attributes of the link tags. This means that the attributes DO
 *   NOT get resolved as they would by an XML or HTML parser. In
 *   particular, only certain entities get replaced, and href
 *   attributes do not get resolved relative to a base URL.
 *
 * From http://openid.net/specs.bml:
 *
 * - The openid.server URL MUST be an absolute URL. OpenID consumers
 *   MUST NOT attempt to resolve relative URLs.
 *
 * - The openid.server URL MUST NOT include entities other than &amp;,
 *   &lt;, &gt;, and &quot;.
 *
 * The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds
 * of quoting are allowed for attributes.
 *
 * The parser deals with invalid markup in these ways:
 *
 * - Tag names are not case-sensitive
 *
 * - The <html> tag is accepted even when it is not at the top level
 *
 * - The <head> tag is accepted even when it is not a direct child of
 *   the <html> tag, but a <html> tag must be an ancestor of the
 *   <head> tag
 *
 * - <link> tags are accepted even when they are not direct children
 *   of the <head> tag, but a <head> tag must be an ancestor of the
 *   <link> tag
 *
 * - If there is no closing tag for an open <html> or <head> tag, the
 *   remainder of the document is viewed as being inside of the
 *   tag. If there is no closing tag for a <link> tag, the link tag is
 *   treated as a short tag. Exceptions to this rule are that <html>
 *   closes <html> and <body> or <head> closes <head>
 *
 * - Attributes of the <link> tag are not required to be quoted.
 *
 * - In the case of duplicated attribute names, the attribute coming
 *   last in the tag will be the value returned.
 *
 * - Any text that does not parse as an attribute within a link tag
 *   will be ignored. (e.g. <link pumpkin rel='openid.server' /> will
 *   ignore pumpkin)
 *
 * - If there are more than one <html> or <head> tag, the parser only
 *   looks inside of the first one.
 *
 * - The contents of <script> tags are ignored entirely, except
 *   unclosed <script> tags. Unclosed <script> tags are ignored.
 *
 * - Any other invalid markup is ignored, including unclosed SGML
 *   comments and unclosed <![CDATA[blocks.
 *
 * PHP versions 4 and 5
 *
 * LICENSE: See the COPYING file included in this distribution.
 *
 * @access private
 * @package OpenID
 * @author JanRain, Inc. <openid@janrain.com>
 * @copyright 2005-2008 Janrain, Inc.
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache
 */

/**
 * Require Auth_OpenID::arrayGet().
 */
require_once "Auth/OpenID.php";

class Auth_OpenID_Parse {
  
  /**
   * Specify some flags for use with regex matching.
   */
  var $_re_flags = "si";
  
  /**
   * Stuff to remove before we start looking for tags
   */
  var $_removed_re = "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";
  
  /**
   * Starts with the tag name at a word boundary, where the tag name
   * is not a namespace
   */
  var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*?)(?:<\/?%s\s*>|\Z))";
  
  var $_attr_find = '\b(\w+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';
  
  var $_open_tag_expr = "<%s\b";
  var $_close_tag_expr = "<((\/%s\b)|(%s[^>\/]*\/))>";

  function Auth_OpenID_Parse() {
    $this->_link_find = sprintf("/<link\b(?!:)([^>]*)(?!<)>/%s", $this->_re_flags);
    
    $this->_entity_replacements = array('amp' => '&', 'lt' => '<', 'gt' => '>', 'quot' => '"');
    
    $this->_attr_find = sprintf("/%s/%s", $this->_attr_find, $this->_re_flags);
    
    $this->_removed_re = sprintf("/%s/%s", $this->_removed_re, $this->_re_flags);
    
    $this->_ent_replace = sprintf("&(%s);", implode("|", $this->_entity_replacements));
  }

  /**
   * Returns a regular expression that will match a given tag in an
   * SGML string.
   */
  function tagMatcher($tag_name, $close_tags = null) {
    $expr = $this->_tag_expr;
    
    if ($close_tags) {
      $options = implode("|", array_merge(array($tag_name), $close_tags));
      $closer = sprintf("(?:%s)", $options);
    } else {
      $closer = $tag_name;
    }
    
    $expr = sprintf($expr, $tag_name, $closer);
    return sprintf("/%s/%s", $expr, $this->_re_flags);
  }

  function openTag($tag_name) {
    $expr = sprintf($this->_open_tag_expr, $tag_name);
    return sprintf("/%s/%s", $expr, $this->_re_flags);
  }

  function closeTag($tag_name) {
    $expr = sprintf($this->_close_tag_expr, $tag_name, $tag_name);
    return sprintf("/%s/%s", $expr, $this->_re_flags);
  }

  function htmlBegin($s) {
    $matches = array();
    $result = preg_match($this->openTag('html'), $s, $matches, PREG_OFFSET_CAPTURE);
    if ($result === false || ! $matches) {
      return false;
    }
    // Return the offset of the first match.
    return $matches[0][1];
  }

  function htmlEnd($s) {
    $matches = array();
    $result = preg_match($this->closeTag('html'), $s, $matches, PREG_OFFSET_CAPTURE);
    if ($result === false || ! $matches) {
      return false;
    }
    // Return the offset of the first match.
    return $matches[count($matches) - 1][1];
  }

  function headFind() {
    return $this->tagMatcher('head', array('body', 'html'));
  }

  function replaceEntities($str) {
    foreach ($this->_entity_replacements as $old => $new) {
      $str = preg_replace(sprintf("/&%s;/", $old), $new, $str);
    }
    return $str;
  }

  function removeQuotes($str) {
    $matches = array();
    $double = '/^"(.*)"$/';
    $single = "/^\'(.*)\'$/";
    
    if (preg_match($double, $str, $matches)) {
      return $matches[1];
    } else if (preg_match($single, $str, $matches)) {
      return $matches[1];
    } else {
      return $str;
    }
  }

  /**
   * Find all link tags in a string representing a HTML document and
   * return a list of their attributes.
   *
   * @param string $html The text to parse
   * @return array $list An array of arrays of attributes, one for each
   * link tag
   */
  function parseLinkAttrs($html) {
    $stripped = preg_replace($this->_removed_re, "", $html);
    
    $html_begin = $this->htmlBegin($stripped);
    $html_end = $this->htmlEnd($stripped);
    
    if ($html_begin === false) {
      return array();
    }
    
    if ($html_end === false) {
      $html_end = strlen($stripped);
    }
    
    $stripped = substr($stripped, $html_begin, $html_end - $html_begin);
    
    // Try to find the <HEAD> tag.
    $head_re = $this->headFind();
    $head_matches = array();
    if (! preg_match($head_re, $stripped, $head_matches)) {
      return array();
    }
    
    $link_data = array();
    $link_matches = array();
    
    if (! preg_match_all($this->_link_find, $head_matches[0], $link_matches)) {
      return array();
    }
    
    foreach ($link_matches[0] as $link) {
      $attr_matches = array();
      preg_match_all($this->_attr_find, $link, $attr_matches);
      $link_attrs = array();
      foreach ($attr_matches[0] as $index => $full_match) {
        $name = $attr_matches[1][$index];
        $value = $this->replaceEntities($this->removeQuotes($attr_matches[2][$index]));
        
        $link_attrs[strtolower($name)] = $value;
      }
      $link_data[] = $link_attrs;
    }
    
    return $link_data;
  }

  function relMatches($rel_attr, $target_rel) {
    // Does this target_rel appear in the rel_str?
    // XXX: TESTME
    $rels = preg_split("/\s+/", trim($rel_attr));
    foreach ($rels as $rel) {
      $rel = strtolower($rel);
      if ($rel == $target_rel) {
        return 1;
      }
    }
    
    return 0;
  }

  function linkHasRel($link_attrs, $target_rel) {
    // Does this link have target_rel as a relationship?
    // XXX: TESTME
    $rel_attr = Auth_OpeniD::arrayGet($link_attrs, 'rel', null);
    return ($rel_attr && $this->relMatches($rel_attr, $target_rel));
  }

  function findLinksRel($link_attrs_list, $target_rel) {
    // Filter the list of link attributes on whether it has
    // target_rel as a relationship.
    // XXX: TESTME
    $result = array();
    foreach ($link_attrs_list as $attr) {
      if ($this->linkHasRel($attr, $target_rel)) {
        $result[] = $attr;
      }
    }
    
    return $result;
  }

  function findFirstHref($link_attrs_list, $target_rel) {
    // Return the value of the href attribute for the first link
    // tag in the list that has target_rel as a relationship.
    // XXX: TESTME
    $matches = $this->findLinksRel($link_attrs_list, $target_rel);
    if (! $matches) {
      return null;
    }
    $first = $matches[0];
    return Auth_OpenID::arrayGet($first, 'href', null);
  }
}

function Auth_OpenID_legacy_discover($html_text, $server_rel, $delegate_rel) {
  $p = new Auth_OpenID_Parse();
  
  $link_attrs = $p->parseLinkAttrs($html_text);
  
  $server_url = $p->findFirstHref($link_attrs, $server_rel);
  
  if ($server_url === null) {
    return false;
  } else {
    $delegate_url = $p->findFirstHref($link_attrs, $delegate_rel);
    return array($delegate_url, $server_url);
  }
}
